SAT Scores Analysis

Posted on September 07, 2016 in posts

In [1]:
import pandas
import numpy as np

files = ["ap_2010.csv", "class_size.csv", "demographics.csv", "graduation.csv", "hs_directory.csv", "math_test_results.csv", "sat_results.csv"]

data = {}
for f in files:
    d = pandas.read_csv("schools/{0}".format(f))
    data[f.replace(".csv", "")] = d
In [2]:
for k,v in data.items():
    print("\n" + k + "\n")
    #print(v.head())
sat_results


hs_directory


math_test_results


ap_2010


class_size


demographics


graduation

In [3]:
#data["demographics"]["DBN"].head()
In [4]:
#data["class_size"].head()
In [5]:
data["class_size"]["DBN"] = data["class_size"].apply(lambda x: "{0:02d}{1}".format(x["CSD"], x["SCHOOL CODE"]), axis=1)
data["hs_directory"]["DBN"] = data["hs_directory"]["dbn"]
In [6]:
survey1 = pandas.read_csv("schools/survey_all.txt", delimiter="\t", encoding='windows-1252')
survey2 = pandas.read_csv("schools/survey_d75.txt", delimiter="\t", encoding='windows-1252')
survey1["d75"] = False
survey2["d75"] = True
survey = pandas.concat([survey1, survey2], axis=0)
In [7]:
#survey.head()
In [8]:
survey["DBN"] = survey["dbn"]
survey_fields = ["DBN", "rr_s", "rr_t", "rr_p", "N_s", "N_t", "N_p", "saf_p_11", "com_p_11", "eng_p_11", "aca_p_11", "saf_t_11", "com_t_11", "eng_t_10", "aca_t_11", "saf_s_11", "com_s_11", "eng_s_11", "aca_s_11", "saf_tot_11", "com_tot_11", "eng_tot_11", "aca_tot_11",]
survey = survey.loc[:,survey_fields]
data["survey"] = survey
survey.shape
Out[8]:
(1702, 23)
In [9]:
#data["class_size"].head()
In [10]:
#data["sat_results"].head()
In [11]:
class_size = data["class_size"]
class_size = class_size[class_size["GRADE "] == "09-12"]
class_size = class_size[class_size["PROGRAM TYPE"] == "GEN ED"]
class_size = class_size.groupby("DBN").agg(np.mean)
class_size.reset_index(inplace=True)
data["class_size"] = class_size
In [12]:
demographics = data["demographics"]
demographics = demographics[demographics["schoolyear"] == 20112012]
data["demographics"] = demographics
In [13]:
data["math_test_results"] = data["math_test_results"][data["math_test_results"]["Year"] == 2011]
data["math_test_results"] = data["math_test_results"][data["math_test_results"]["Grade"] == '8']
In [14]:
data["graduation"] = data["graduation"][data["graduation"]["Cohort"] == "2006"]
data["graduation"] = data["graduation"][data["graduation"]["Demographic"] == "Total Cohort"]
In [15]:
cols = ['SAT Math Avg. Score', 'SAT Critical Reading Avg. Score', 'SAT Writing Avg. Score']
for c in cols:
    data["sat_results"][c] = data["sat_results"][c].convert_objects(convert_numeric=True)

data['sat_results']['sat_score'] = data['sat_results'][cols[0]] + data['sat_results'][cols[1]] + data['sat_results'][cols[2]]
C:\Anaconda3\lib\site-packages\ipykernel\__main__.py:3: FutureWarning: convert_objects is deprecated.  Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  app.launch_new_instance()
In [16]:
data["hs_directory"]['lat'] = data["hs_directory"]['Location 1'].apply(lambda x: x.split("\n")[-1].replace("(", "").replace(")", "").split(", ")[0])
data["hs_directory"]['lon'] = data["hs_directory"]['Location 1'].apply(lambda x: x.split("\n")[-1].replace("(", "").replace(")", "").split(", ")[1])

for c in ['lat', 'lon']:
    data["hs_directory"][c] = data["hs_directory"][c].convert_objects(convert_numeric=True)
C:\Anaconda3\lib\site-packages\ipykernel\__main__.py:5: FutureWarning: convert_objects is deprecated.  Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
In [17]:
for k,v in data.items():
    print(k)
    #print(v.head())
sat_results
hs_directory
math_test_results
survey
ap_2010
class_size
demographics
graduation
In [18]:
flat_data_names = [k for k,v in data.items()]
flat_data = [data[k] for k in flat_data_names]
full = flat_data[0]
for i, f in enumerate(flat_data[1:]):
    name = flat_data_names[i+1]
    print(name)
    print(len(f["DBN"]) - len(f["DBN"].unique()))
    join_type = "inner"
    if name in ["sat_results", "ap_2010", "graduation"]:
        join_type = "outer"
    if name not in ["math_test_results"]:
        full = full.merge(f, on="DBN", how=join_type)

full.shape
hs_directory
0
math_test_results
0
survey
0
ap_2010
1
class_size
0
demographics
0
graduation
0
Out[18]:
(459, 159)
In [19]:
cols = ['AP Test Takers ', 'Total Exams Taken', 'Number of Exams with scores 3 4 or 5']

for col in cols:
    full[col] = full[col].convert_objects(convert_numeric=True)

full[cols] = full[cols].fillna(value=0)
C:\Anaconda3\lib\site-packages\ipykernel\__main__.py:4: FutureWarning: convert_objects is deprecated.  Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
In [20]:
full["school_dist"] = full["DBN"].apply(lambda x: x[:2])
In [21]:
full = full.fillna(full.mean())
In [22]:
full.corr()['sat_score']
Out[22]:
SAT Critical Reading Avg. Score         0.986820
SAT Math Avg. Score                     0.972643
SAT Writing Avg. Score                  0.987771
sat_score                               1.000000
grade_span_max                               NaN
expgrade_span_max                            NaN
zip                                    -0.063977
total_students                          0.407827
number_programs                         0.117012
lat                                    -0.121029
lon                                    -0.132222
rr_s                                    0.232199
rr_t                                   -0.023386
rr_p                                    0.047925
N_s                                     0.423463
N_t                                     0.291463
N_p                                     0.421530
saf_p_11                                0.122913
com_p_11                               -0.115073
eng_p_11                                0.020254
aca_p_11                                0.035155
saf_t_11                                0.313810
com_t_11                                0.082419
eng_t_10                                     NaN
aca_t_11                                0.132348
saf_s_11                                0.337639
com_s_11                                0.187370
eng_s_11                                0.213822
aca_s_11                                0.339435
saf_tot_11                              0.318753
                                          ...   
AP Test Takers                          0.562177
Total Exams Taken                       0.553398
Number of Exams with scores 3 4 or 5    0.559260
CSD                                     0.041645
NUMBER OF STUDENTS / SEATS FILLED       0.389864
NUMBER OF SECTIONS                      0.356769
AVERAGE CLASS SIZE                      0.372462
SIZE OF SMALLEST CLASS                  0.240140
SIZE OF LARGEST CLASS                   0.307143
SCHOOLWIDE PUPIL-TEACHER RATIO               NaN
schoolyear                                   NaN
frl_percent                            -0.706449
total_enrollment                        0.363924
ell_num                                -0.149862
ell_percent                            -0.386465
sped_num                                0.034445
sped_percent                           -0.432402
asian_num                               0.473049
asian_per                               0.544016
black_num                               0.027202
black_per                              -0.276387
hispanic_num                            0.025273
hispanic_per                           -0.384876
white_num                               0.447907
white_per                               0.615877
male_num                                0.322530
male_per                               -0.107994
female_num                              0.383790
female_per                              0.108037
Total Cohort                            0.308939
Name: sat_score, dtype: float64
In [23]:
import folium
from folium import plugins

schools_map = folium.Map(location=[full['lat'].mean(), full['lon'].mean()], zoom_start=10)
marker_cluster = folium.MarkerCluster().add_to(schools_map)
for name, row in full.iterrows():
    folium.Marker([row["lat"], row["lon"]], popup="{0}: {1}".format(row["DBN"], row["school_name"])).add_to(marker_cluster)
schools_map.create_map('schools.html')
schools_map
C:\Anaconda3\lib\site-packages\ipykernel\__main__.py:8: FutureWarning: Map.create_map is deprecated. Use Map.save instead
Out[23]:
In [24]:
schools_heatmap = folium.Map(location=[full['lat'].mean(), full['lon'].mean()], zoom_start=10)
schools_heatmap.add_children(plugins.HeatMap([[row["lat"], row["lon"]] for name, row in full.iterrows()]))
schools_heatmap.save("heatmap.html")
schools_heatmap
Out[24]:
In [25]:
district_data = full.groupby("school_dist").agg(np.mean)
district_data.reset_index(inplace=True)
district_data["school_dist"] = district_data["school_dist"].apply(lambda x: str(int(x)))
In [26]:
def show_district_map(col):
    geo_path = 'schools/districts.geojson'
    districts = folium.Map(location=[full['lat'].mean(), full['lon'].mean()], zoom_start=10)
    districts.geo_json(
        geo_path=geo_path,
        data=district_data,
        columns=['school_dist', col],
        key_on='feature.properties.school_dist',
        fill_color='YlGn',
        fill_opacity=0.7,
        line_opacity=0.2,
    )
    districts.save("districts.html")
    return districts

show_district_map("sat_score")
C:\Anaconda3\lib\site-packages\folium\folium.py:504: UserWarning: This method is deprecated. Please use Map.choropleth instead.
  warnings.warn('This method is deprecated. '
C:\Anaconda3\lib\site-packages\folium\folium.py:506: FutureWarning: 'threshold_scale' default behavior has changed. Now you get a linear scale between the 'min' and the 'max' of your data. To get former behavior, use folium.utilities.split_six.
  return self.choropleth(*args, **kwargs)
Out[26]:
In [27]:
%matplotlib inline

full.plot.scatter(x='total_enrollment', y='sat_score')
Out[27]:
In [28]:
full[(full["total_enrollment"] < 1000) & (full["sat_score"] < 1000)]["School Name"]
Out[28]:
91       INTERNATIONAL COMMUNITY HIGH SCHOOL
125                                      NaN
126          BRONX INTERNATIONAL HIGH SCHOOL
139    KINGSBRIDGE INTERNATIONAL HIGH SCHOOL
141    INTERNATIONAL SCHOOL FOR LIBERAL ARTS
176                                      NaN
179            HIGH SCHOOL OF WORLD CULTURES
188       BROOKLYN INTERNATIONAL HIGH SCHOOL
225    INTERNATIONAL HIGH SCHOOL AT PROSPECT
237               IT TAKES A VILLAGE ACADEMY
253                MULTICULTURAL HIGH SCHOOL
286    PAN AMERICAN INTERNATIONAL HIGH SCHOO
Name: School Name, dtype: object
In [29]:
full.plot.scatter(x='ell_percent', y='sat_score')
Out[29]:
In [30]:
show_district_map("ell_percent")
C:\Anaconda3\lib\site-packages\folium\folium.py:504: UserWarning: This method is deprecated. Please use Map.choropleth instead.
  warnings.warn('This method is deprecated. '
C:\Anaconda3\lib\site-packages\folium\folium.py:506: FutureWarning: 'threshold_scale' default behavior has changed. Now you get a linear scale between the 'min' and the 'max' of your data. To get former behavior, use folium.utilities.split_six.
  return self.choropleth(*args, **kwargs)
Out[30]:
In [31]:
full.corr()["sat_score"][["rr_s", "rr_t", "rr_p", "N_s", "N_t", "N_p", "saf_tot_11", "com_tot_11", "aca_tot_11", "eng_tot_11"]].plot.bar()
Out[31]:
In [32]:
full.corr()["sat_score"][["white_per", "asian_per", "black_per", "hispanic_per"]].plot.bar()
Out[32]:
In [33]:
show_district_map("hispanic_per")
C:\Anaconda3\lib\site-packages\folium\folium.py:504: UserWarning: This method is deprecated. Please use Map.choropleth instead.
  warnings.warn('This method is deprecated. '
C:\Anaconda3\lib\site-packages\folium\folium.py:506: FutureWarning: 'threshold_scale' default behavior has changed. Now you get a linear scale between the 'min' and the 'max' of your data. To get former behavior, use folium.utilities.split_six.
  return self.choropleth(*args, **kwargs)
Out[33]:
In [34]:
full.corr()["sat_score"][["male_per", "female_per"]].plot.bar()
Out[34]:
In [35]:
full.plot.scatter(x='female_per', y='sat_score')
Out[35]:
In [36]:
full[(full["female_per"] > 65) & (full["sat_score"] > 1400)]["School Name"]
Out[36]:
5                    BARD HIGH SCHOOL EARLY COLLEGE
22            PROFESSIONAL PERFORMING ARTS HIGH SCH
26                    ELEANOR ROOSEVELT HIGH SCHOOL
40                     TALENT UNLIMITED HIGH SCHOOL
61             FIORELLO H. LAGUARDIA HIGH SCHOOL OF
302                     TOWNSEND HARRIS HIGH SCHOOL
342    FRANK SINATRA SCHOOL OF THE ARTS HIGH SCHOOL
Name: School Name, dtype: object
In [37]:
full["ap_avg"] = full["AP Test Takers "] / full["total_enrollment"]

full.plot.scatter(x='ap_avg', y='sat_score')
Out[37]:
In [38]:
full[(full["ap_avg"] > .3) & (full["sat_score"] > 1700)]["School Name"]
Out[38]:
26             ELEANOR ROOSEVELT HIGH SCHOOL
37                    STUYVESANT HIGH SCHOOL
151             BRONX HIGH SCHOOL OF SCIENCE
155    HIGH SCHOOL OF AMERICAN STUDIES AT LE
187           BROOKLYN TECHNICAL HIGH SCHOOL
302              TOWNSEND HARRIS HIGH SCHOOL
327    QUEENS HIGH SCHOOL FOR THE SCIENCES A
356      STATEN ISLAND TECHNICAL HIGH SCHOOL
Name: School Name, dtype: object